data %>%
ggplot(aes(x = ind_var,
y = dep_var)) +
geom_point(aes(colour = factor(grouping_var)),
size = 1.5)How to visualise data in R
A compendium of code for visualising data in R (primarily using ggplot2).
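Note: this guide uses built-in datasets from the following packages, amongst others: ggplot2 (mpg, diamonds, midwest), gcookbook (countries, heightweight, drunk), and palmerpenguins (penguins).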
1 Structure of a ggplot
- Aesthetics describe mapping between visual elements and variables in the data, e.g. x-axis may be mapped to “time_point”, while colour may be mapped to “gender”.
- Geoms are the type of visual ‘marks’ on a plot such as lines, points, or bars: they are geometrical objects used to represent data.
If you want to add a global attribute (e.g. to apply to all points, lines, or whatever), specify this outside of aes() because it is not a mapping (it doesn’t relate to something in the dataframe itself).
2 Plot types
2.1 Points
mpg %>%
ggplot(aes(x = displ, y = hwy)) +
geom_point(size = 1.5,
position = "identity") This masks the picture because there are overlapping points. Changing position to "jitter":
Adding mappings for colour and shape:
mpg %>%
ggplot(aes(x = displ, y = hwy)) +
geom_point(aes(colour = class,
shape = class),
size = 1.5,
position = "jitter") Note that this can also be written as:
mpg %>%
ggplot() +
geom_point(aes(x = displ, y = hwy,
colour = class,
shape = class),
size = 1.5)2.1.1 Point shapes
Note: the fill attribute can be changed to any colour for shapes 21 to 24.
2.2 Lines
gcookbook::countries %>%
filter(Name %in% c("United Kingdom", "Ireland") & Year > 1980) %>%
ggplot(aes(x=Year, y=GDP)) +
geom_line(aes(colour = Name,
linetype = Name),
linewidth = 0.8)# to add arrows to the ends of lines:
# arrow = arrow(length = unit(0.25, "cm"), ends = "last", type = "closed")Adding a line of best fit
gcookbook::heightweight %>%
ggplot(aes(x=heightIn, y=weightLb)) +
geom_point(aes(colour=sex)) +
geom_smooth(method="lm", aes(colour=sex))Use geom_segment to add lines from points to fitted regression slope
ggplot(df, aes(x=x, y=y)) +
geom_point() +
theme_classic() +
geom_smooth(method="lm", se=F) +
geom_segment(aes(x = x, y = y,
xend = x, yend = Fitted), linetype = "dashed")If you want to sum a value across factor levels using a linegraph, you need to use stat_summary:
df %>%
ggplot(aes(x=year, y=value)) +
stat_summary(fun = "sum", geom = "line")2.2.1 Linetypes
2.3 Bars
2.3.1 Simple bar chart
geom_col leaves the data as it is and merely represents values already in the dataframe. geom_bar uses stat_count to derive new values from the data. As a result, geom_bar doesn’t expect a y-value, but if you provide one (together with stat = "identity") then you are telling it to forgo the aggregation it would otherwise have done with stat_count.
Using stat = "count"
palmerpenguins::penguins %>%
ggplot(aes(x=species, fill=species)) +
geom_bar(stat="count",
width = 0.8,
show.legend = F)Using stat = "identity"
gcookbook::drunk %>%
pivot_longer(c(2:6)) %>%
group_by(sex) %>%
summarise(felonies = sum(value)) %>%
ggplot(aes(x = sex, y = felonies)) +
geom_bar(stat = "identity", width = 0.8)You should also use stat_identity if you want to reorder bars in descending order.
palmerpenguins::penguins %>%
group_by(species) %>%
summarise(n = n()) %>%
ggplot(aes(reorder(species, -n), n)) +
geom_bar(stat = "identity", width = 0.8, aes(fill = species),
show.legend = F)2.3.2 Dodged bar chart
palmerpenguins::penguins %>%
ggplot(aes(x = island, fill = sex)) +
geom_bar(position = position_dodge(),
width = 0.8)
Note that the above graph contains NAs. To remove these, use the drop_na function (from tidyr):
penguins %>%
drop_na(sex) %>%
ggplot(aes(x = island, fill = sex)) + ...2.3.3 Stacked bar chart
palmerpenguins::penguins %>%
ggplot(aes(x = island, fill = sex)) +
geom_bar(position = position_stack(),
width = 0.8)2.4 Proportional stacked bar
palmerpenguins::penguins %>%
ggplot(aes(x = island, fill = sex)) +
geom_bar(position = position_fill(),
width= 0.8) +
ylab("Proportion")Text is dealt with later on, but to add figures to the above figure:
palmerpenguins::penguins %>%
group_by(island, sex) %>%
summarise(n = n()) %>%
mutate(prop = n / sum(n)) %>%
ggplot(aes(x = island, y = prop, fill = sex)) +
geom_bar(stat = "identity", width = 0.8) +
geom_text(aes(label =
paste0(sprintf("%.1f", prop * 100), "%")),
position = position_stack(vjust = 0.5),
size = 3.5) +
ylab("Proportion")# note: to change decimal places, "%.0f", "%.2f" etc2.5 Boxplot
palmerpenguins::penguins %>%
ggplot(aes(x = species, y = bill_length_mm)) +
stat_boxplot(
geom ='errorbar',
width = 0.5) +
geom_boxplot(
notch = F,
outlier.color = "red",
outlier.size = 3
)2.6 Violin plot
palmerpenguins::penguins %>%
ggplot(aes(x=species, y=bill_length_mm)) +
geom_violin(aes(fill = species),
draw_quantiles = c(0.5)) # 0.5 is median2.7 Dotplot
For when there isn’t a huge amount of data. Each dot represents a single observation.
palmerpenguins::penguins %>%
sample_n(125, replace=F) %>%
ggplot(aes(x=body_mass_g)) +
geom_dotplot(dotsize = 1, width = 1)2.8 Histogram
For when there’s more data, or binning is better.
palmerpenguins::penguins %>%
ggplot(aes(x=body_mass_g)) +
geom_histogram(binwidth = 100,
fill = "grey",
colour = "black")# multiple groups
palmerpenguins::penguins %>%
ggplot(aes(x=body_mass_g)) +
geom_histogram(aes(fill=species),
binwidth = 50,
alpha = 0.7)Example of a function to iterate through multiple histograms, by group:
# firstly make a vector of all the variable names you want to plot
vars = c("bill_length_mm", "flipper_length_mm")
# then create a custom graphing function
# Draw overlaid, semi-transparent histograms of one variable, split by a
# grouping variable.
#
# Arguments:
#   data  data frame containing the columns named by `x` and `y`
#   x     name (string) of the column to bin along the x-axis; also used as
#         the plot title
#   y     name (string) of the column mapped to `fill`
#   bins  number of histogram bins; defaults to 30, the value
#         geom_histogram() falls back to (with a message) when unspecified
#
# Returns the ggplot object built by the chain below.
hist_fun = function(data, x, y, bins = 30) {
  # fail early, rather than at print time, if a column name is misspelt
  missing_cols = setdiff(c(x, y), names(data))
  if (length(missing_cols) > 0) {
    stop("Column(s) not found in data: ",
         paste(missing_cols, collapse = ", "),
         call. = FALSE)
  }
  ggplot(data, aes(x = .data[[x]], fill = .data[[y]])) +
    # position = "identity" overlays the groups instead of stacking them;
    # alpha keeps the overlapping bars visible
    geom_histogram(alpha = 0.5, position = "identity", bins = bins) +
    theme_bw() +
    ggtitle(x)
}
# use purrr::map to cycle through vars to produce plots, with a constant grouping factor
map(vars, ~ hist_fun(data = palmerpenguins::penguins, .x, "sex") )[[1]]
[[2]]
2.9 Density
For large amounts of data. Use bw argument to change bandwidth (e.g. if you want more smoothing).
ggplot2::diamonds %>%
ggplot(aes(x = carat)) +
geom_density(fill = "lightblue",
alpha=0.8)2.10 Pie
palmerpenguins::penguins %>%
count(species) %>%
ggplot(aes(x = "", y = n, fill = species)) +
geom_col(color = "black") +
coord_polar(theta = "y") +
geom_text(aes(label = n),
position = position_stack(vjust = 0.5)) +
theme_void()3 Plotting statistical summary data
It’s possible to plot different statistical summaries within ggplot2, for instance the median.
palmerpenguins::penguins %>%
ggplot(aes(x=species, y=flipper_length_mm)) +
geom_bar(fun = "median",
stat = "summary")However, a more powerful method is to use stat_summary.
palmerpenguins::penguins %>%
ggplot(aes(x=species, y=flipper_length_mm)) +
stat_summary(fun = "mean", geom = "point") +
stat_summary(fun = "mean", geom = "line", aes(group=1)) Add confidence intervals (requires the Hmisc package):
mpg %>%
ggplot(aes(class, hwy)) +
stat_summary(fun.data = "mean_cl_normal",
fun.args = list(
conf.int = .95
),
geom = "errorbar",
width = .4) +
stat_summary(fun = "mean",
geom = "point")3.1 Pointrange
# Summarise flipper length per species as a point (the median) with a range
# running from the minimum to the maximum.
palmerpenguins::penguins %>%
  ggplot() +
  geom_pointrange(mapping = aes(x = species, y = flipper_length_mm),
                  stat = "summary",
                  # fun.ymin / fun.ymax / fun.y were deprecated in ggplot2 3.3.0
                  # in favour of fun.min / fun.max / fun
                  fun.min = min,
                  fun.max = max,
                  fun = median)
4 Working with text
4.1 Plot title
p + ggtitle("Main plot title", subtitle = "Plot subtitle")
# centre align plot title
p + theme(plot.title = element_text(hjust = 0.5))
# change plot title size
p + theme(plot.title = element_text(size = 18))4.2 Adding text to charts
palmerpenguins::penguins %>%
group_by(species) %>%
summarise(n = n()) %>%
ggplot(aes(x = species, y = n)) +
geom_bar(stat = "identity", width = 0.8) +
geom_text(
aes(label = n,
vjust = -0.5),
size = 3.5)5 Axes
Note that the same functions are used for x or y axes - adjust accordingly.
# custom axis titles
p + xlab("...")
p + ylab("...")
# remove axis title
p + theme(axis.title.x = element_blank())
# change axis title text size
p + theme(axis.title.x = element_text(size = 14))
# rotate x- axis title text 45 degrees
p + theme(axis.title.x = element_text(angle = 45, vjust = 0.5))
# rotate y-axis to 0 degrees (to be read horizontally not vertically)
p + theme(axis.title.y = element_text(angle = 0, vjust = 0.5))
# remove tick marks
p + theme(axis.ticks.x = element_blank())5.1 Axis formatting options
# percent labels with breaks from 0% to 100% in steps of 10%
p + scale_y_continuous(labels = scales::label_percent(accuracy = 1), breaks = (0:10)/10)
# currency prefix
p + scale_y_continuous(labels = scales::label_dollar(prefix = "£"))
# can also specify suffix, big.mark = ",", and decimal.mark = "."
# dates and times: these labellers format Date / date-time values, so they
# need a date or date-time scale rather than a continuous one
p + scale_y_date(labels = scales::label_date(format = "%Y-%m-%d"))
p + scale_y_datetime(labels = scales::label_time(format = "%H:%M:%S"))
# thousands separator
p + scale_y_continuous(labels = scales::label_comma(big.mark = ","))
5.2 Limits and ranges of axes
# xlim, ylim, coord_cartesian (doesn't clip data)
# scale_x_continuous(limits = c(0, 100))A tip: if you want a y-axis to begin at zero but don’t know the maximum, you can do this:
p + expand_limits(y = 0)6 Themes
To add a pre-defined theme to a plot: p + theme_*().
Tips:
- Place the theme argument early on in the plot sequence if you want to adjust other features, e.g. axis attributes, because otherwise theme will override these.
- Set a global theme: theme_set(theme_classic(base_size = 16)).
6.1 A custom theme I like
p + theme(axis.line = element_line(colour="black"),
panel.grid.major.y = element_line(colour="grey90"),
panel.grid.major.x = element_blank(),
panel.background = element_rect(fill = "white",
colour = NA))7 Legends
You can control the palette, breaks, labels, and name. E.g. if the factor labels are too long, you can shorten them. Use a separate call to control other attributes like linetype or fill.
gcookbook::countries %>%
filter(str_detect(Name, "United")) %>%
ggplot(aes(x = Year, y = GDP)) +
geom_line(aes(colour = Name), linewidth = 0.8) +
scale_colour_manual(values = c(RColorBrewer::brewer.pal(3, "Set2")),
breaks = c("United Kingdom", "United Arab Emirates", "United States"),
labels = c("UK", "UAE", "US"),
name = "Country")7.1 Direct labels
Sometimes a legend isn’t necessary or the right aesthetic choice. Instead, it’s possible to append labels directly to dots or lines using directlabels or ggrepel.
Using directlabels and geom_dl (note that other method options include first.points, and last.qp, which adjusts the size of the text automatically). This also requires clipping to be turned off and the plot margins to be extended (otherwise text won’t be displayed properly).
df %>%
ggplot(aes(x = time_period, y = total_exam_entries)) +
geom_line(aes(colour = subject), linewidth = 0.8) +
ylab("Entries") +
theme(axis.title.x = element_blank(),
axis.text.x = element_text(angle=270, vjust = 0.5),
legend.position = "none",
axis.line = element_line(colour="black"),
panel.grid.minor = element_blank()) +
scale_y_continuous(labels = scales::label_comma(big.mark = ","),
limits = c(0,80000)) +
scale_x_continuous(breaks = c(200910, 201011, 201112, 201213, 201314, 201415, 201516, 201617,
201718, 201819, 201920, 202021, 202122, 202223),
labels = c("2009/10", "2010/11", "2011/12", "2012/13", "2013/14", "2014/15",
"2015/16", "2016/17", "2017/18", "2018/19", "2019/20", "2020/21",
"2021/22", "2022/23")) +
geom_dl(aes(label=subject), method=list("last.points", "bumpup", cex=0.8)) +
coord_cartesian(clip="off") +
theme(plot.margin = unit(c(1,4,1,1), "lines")) Using geom_label_repel. Note that this requires a bit more wrangling to make sure only the final points are displayed (using the data argument) - otherwise every single point will be labelled.
df %>%
ggplot(aes(x = time_period, y = total_exam_entries, label = subject)) +
geom_line(aes(colour = subject), linewidth = 0.8) +
ylab("Entries") +
theme(axis.title.x = element_blank(),
axis.text.x = element_text(angle=270, vjust = 0.5),
legend.position = "none",
axis.line = element_line(colour="black"),
panel.grid.minor = element_blank()) +
scale_y_continuous(labels = scales::label_comma(big.mark = ","),
limits = c(0,80000)) +
scale_x_continuous(breaks = c(200910, 201011, 201112, 201213, 201314, 201415, 201516, 201617,
201718, 201819, 201920, 202021, 202122, 202223),
labels = c("2009/10", "2010/11", "2011/12", "2012/13", "2013/14", "2014/15",
"2015/16", "2016/17", "2017/18", "2018/19", "2019/20", "2020/21",
"2021/22", "2022/23")) +
coord_cartesian(clip = "off") +
geom_label_repel(aes(label = subject),
label.padding = .15,
data = df %>% group_by(subject) %>% filter(time_period == max(time_period)),
size = 3, hjust = 0.5, nudge_x=0.5)8 Other things
8.1 Flip coordinates
ggplot2::diamonds %>%
ggplot(aes(x=cut, y=carat)) +
geom_violin() +
coord_flip()9 Colour palettes
9.1 Manual
# Sequential palettes
blues = c("#104F75", "#407291", "#7095AC", "#9FB9C8", "#CFDCE3")
reds = c("#8A2529", "#A15154", "#B97C7F", "#D0A8A9", "#E8D3D4")
oranges = c("#E87D1E", "#ED974B", "#F1B178", "#F6CBA5", "#FAE5D2")
yellows = c("#C2A204", "#CEB536", "#DAC768", "#E7DA87", "#F3ECCD")
greens = c("#004712", "#336C41", "#669171", "#99B5A0", "#CFDABD")
purples = c("#260859", "#51397A", "#7D6B9B", "#A89CBD", "#D4CEDE")
show_col(c(blues, reds, oranges, yellows, greens, purples), ncol=5, cex_label=0.7)scale_*_manual for manually-created palettes
# example of sequential. Tip: use rev() to reverse the order of colours
ggplot2::diamonds %>%
filter(price < 6000) %>%
ggplot(aes(x = price, fill = cut)) +
geom_histogram(position = "dodge", binwidth = 1000) +
scale_fill_manual(values = rev(yellows))An example of how to specify manual linetype and colour. Note that these must use the same breaks in order to be presented as a single legend (as opposed to separate legends for linetype and for colour):
scale_linetype_manual(name = "",
values = c("solid", "dashed"),
breaks = c("pay", "median_after_tax"),
labels = c("Pay", "Median UK")) +
scale_colour_manual(name = "",
values = c("black", "red"),
breaks = c("pay", "median_after_tax"),
labels = c("Pay", "Median UK"))Use a focus palette to bring attention to one particular series, while keeping the other data there for comparison.
focus = c("#12436D", "#BFBFBF", "#BFBFBF") # contrast ratio of 5.57:1 (passes Web Content Accessibility
# Guidelines [WCAG]).
# Add as many grey colours as you need.
gcookbook::countries %>%
  filter(str_detect(Name, "United")) %>%
  ggplot(aes(x = Year, y = GDP)) +
  # linewidth (not size) controls line thickness since ggplot2 3.4.0
  geom_line(aes(colour = Name), linewidth = 0.8) +
  # unnamed `values` are matched to `breaks` in order, so the first break
  # receives the focus colour and the remaining breaks are grey
  scale_colour_manual(values = focus,
                      breaks = c("United Kingdom", "United Arab Emirates", "United States"),
                      labels = c("UK", "UAE", "US"),
                      name = "Country")
Alternatively, use gghighlight:
# Draw every series whose name contains "Republic", then highlight only the
# one matching "Czech"; the highlighted line is labelled with its Code.
gcookbook::countries %>%
  filter(str_detect(Name, "Republic")) %>%
  ggplot(aes(x = Year, y = GDP)) +
  # linewidth (not size) controls line thickness since ggplot2 3.4.0
  geom_line(aes(colour = Name), linewidth = 0.8) +
  gghighlight(str_detect(Name, "Czech"),
              label_params = list(size = 3),
              label_key = Code)
9.2 Brewer
scale_*_brewer to use predefined palettes:
ggplot2::diamonds %>%
filter(price <6000) %>%
ggplot(aes(x = price, fill = cut)) +
geom_histogram(position = "dodge", binwidth = 1000) +
scale_fill_brewer(palette = "YlOrRd")# Categorical palette
# https://analysisfunction.civilservice.gov.uk/support/communicating-analysis/introduction-to-data-visualisation-e-learning/data-visualisation-e-learning-module-3-accessibility-and-colour-palettes/
# note: recommendation is to use a max of 4 colours
categorical = c("#12436D", "#28A197", "#801650", "#F46A25", "#3D3D3D", "#A285D1")
show_col(categorical)# example
# One bar per county, filled from a ColorBrewer palette.
ggplot2::midwest %>%
  filter(county %in% c("ADAMS", "ALEXANDER", "BOND", "BOONE", "BROWN")) %>%
  ggplot(aes(x = county, y = poptotal, fill = county)) +
  geom_col(position = position_dodge()) +
  # "Set1" is a qualitative palette; `type` is only consulted when `palette`
  # is given as a number, so it is set to "qual" here purely for accuracy
  scale_fill_brewer(type = "qual", palette = "Set1")
10 Odd bits for sorting later
geom_hline
geom_vline11 Combining and saving plots
11.1 Facetting
To facet by a single variable, use facet_wrap. Add scales = "free" if axes are different for each group.
palmerpenguins::penguins %>%
ggplot(aes(x = bill_length_mm, y = flipper_length_mm)) +
geom_point(aes(colour = island)) +
facet_wrap(~ species,
nrow = 3,
scales = "fixed") +
theme_bw()To facet by two variables, use facet_grid
palmerpenguins::penguins %>%
drop_na(sex) %>%
ggplot(aes(x=bill_length_mm, y=flipper_length_mm)) +
geom_point() +
facet_grid(species ~ sex) +
theme_bw()11.2 Arranging
Use ggarrange from the ggpubr package to arrange plots.
# plot 1
a = gcookbook::countries %>%
filter(Name == "United Kingdom") %>%
ggplot(aes(x = Year, y = GDP)) +
geom_line() +
scale_y_continuous(labels = label_comma(big.mark = ",")) +
theme_bw()
# plot 2
b = gcookbook::countries %>%
filter(Name == "United Kingdom") %>%
ggplot(aes(x = Year, y = healthexp)) +
geom_line() +
scale_y_continuous(labels = label_comma(big.mark = ",")) +
theme_bw()
# combine
ggarrange(a, b,
ncol=2,
widths = c(1,1),
labels = c("A", "B"))You can also nest ggarrange calls, e.g. if you have 3 or more plots
c = gcookbook::countries %>%
filter(Name == "United Kingdom") %>%
ggplot(aes(x = Year, y = infmortality)) +
geom_line() +
theme_bw()
ggarrange(a,
ggarrange(b, c, ncol = 2),
nrow = 2)It’s also possible to use a shared legend and title (if applicable to all plots)
a = gcookbook::countries %>%
filter(str_detect(Name, "United")) %>%
ggplot(aes(x = Year, y = GDP)) +
geom_line(aes(colour = Name, linetype = Name), linewidth = 0.8) +
theme_bw()
# plot 2
b = gcookbook::countries %>%
filter(str_detect(Name, "United")) %>%
ggplot(aes(x = Year, y = healthexp)) +
geom_line(aes(colour = Name, linetype = Name), linewidth = 0.8) +
theme_bw()
# wrap in annotate_figure for common title
annotate_figure(ggarrange(a, b, ncol = 2,
                          common.legend = TRUE,
                          legend = "bottom"),
                # fig.lab.pos / fig.lab.size only style `fig.lab`, which is not
                # used here, so the title size is set through text_grob()
                top = text_grob("Common title", size = 14))
11.3 Saving
my_plot = ggplot(...)
ppi = 300 # pixels per inch
# width and height are given in pixels (the png() default): 4 inches x 300 ppi
png("file_name.png", width = 4*ppi, height = 4*ppi, res = ppi)
# print() explicitly: auto-printing only happens at the top level, so a bare
# `my_plot` draws nothing inside a function, a loop, or a source()d script
print(my_plot)
dev.off()